How to characterize different groups of visitors? Is there a pattern in stations that visitor tends to visit?
library(tidyr)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
##
## Następujące obiekty zostały zakryte z 'package:stats':
##
## filter, lag
##
## Następujące obiekty zostały zakryte z 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
library(reshape)
##
## Attaching package: 'reshape'
##
## Następujący obiekt został zakryty z 'package:lubridate':
##
## stamp
##
## Następujący obiekt został zakryty z 'package:dplyr':
##
## rename
##
## Następujący obiekt został zakryty z 'package:tidyr':
##
## expand
library(ggplot2)
library(MASS)
##
## Attaching package: 'MASS'
##
## Następujący obiekt został zakryty z 'package:dplyr':
##
## select
library(cluster)
library(pvclust)
library(dendextend)
##
## Welcome to dendextend version 1.1.2
##
## Type ?dendextend to access the overall documentation and
## browseVignettes(package = 'dendextend') for the package vignette.
## You can execute a demo of the package via: demo(dendextend)
##
## More information is available on the dendextend project web-site:
## https://github.com/talgalili/dendextend/
##
## Contact: <tal.galili@gmail.com>
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
##
## To suppress the this message use:
## suppressPackageStartupMessages(library(dendextend))
##
##
## Attaching package: 'dendextend'
##
## Następujący obiekt został zakryty z 'package:dplyr':
##
## %>%
##
## Następujący obiekt został zakryty z 'package:tidyr':
##
## %>%
##
## Następujący obiekt został zakryty z 'package:stats':
##
## cutree
library(ape)
##
## Attaching package: 'ape'
##
## Następujące obiekty zostały zakryte z 'package:dendextend':
##
## ladderize, rotate
library(RColorBrewer)
library(scales)
library(colorspace) # get nice colors
library(plotly)
##
## Attaching package: 'plotly'
##
## Następujący obiekt został zakryty z 'package:dendextend':
##
## %>%
##
## Następujący obiekt został zakryty z 'package:ggplot2':
##
## last_plot
##
## Następujący obiekt został zakryty z 'package:graphics':
##
## layout
library(stringdist)
load("C:\\Users\\Karola\\Documents\\BISD\\Semestr 2\\Data Mining\\Projekt 2\\verySmallLogs.rda")
#Transforming data in the same way as in phase 1
data <- verySmallLogs %>%
mutate(station,
visitor,
type,
date,
weekday = wday(date, label=TRUE, abbr=FALSE),
hour = substr(date, 12, 13))
data$date <- as.POSIXct(data$date,format="%Y-%m-%d %H:%M:%S")
data = data[data$visitor != -1,]
dataEntering <- data[data$type=="Entering" & data$visitor != -1,]
dataLeaving <- data[data$type=="Leaving" & data$visitor != -1,]
newdataEntering = dataEntering %>%
group_by(visitor, station) %>%
summarise(min_date=min(date),
weekday=head(weekday,1),
hour=head(hour,1),
count = n())
newdataLeaving = dataLeaving %>%
group_by(visitor, station) %>%
summarise(max_date=max(date),
weekday=head(weekday,1),
hour=head(hour,1),
count = n())
mergedData <- merge(newdataEntering, newdataLeaving, by=c("visitor", "station"))
mergedData$time = as.numeric(mergedData$max_date-mergedData$min_date)
mergedData$hour = mergedData$hour.x
mergedData$weekday = mergedData$weekday.x
mergedData$count = mergedData$count.x
mergedData = mergedData[mergedData$time < 800,]
ggplot(data=mergedData,aes(count, time, station))+geom_point(aes(colour = station))
ggplot(data=mergedData,aes(weekday, count, station))+geom_point(aes(colour = station))
mat1 <- dist(mergedData[1:100,c("count","time")])
as.matrix(mat1)[1:5,1:5]
mat2 <- dist(scale(mergedData[1:100,c("count","time")]))
as.matrix(mat2)[1:5,1:5]
mat3 <- dist(scale(mergedData[1:100,c("count","time")]), method="manhattan")
as.matrix(mat3)[1:5,1:5]
dat <- scale(mergedData[,c("count","time")])
sampleData <- mergedData[sample(nrow(mergedData), 3000),]
head(sampleData)
sampleData <- transform(sampleData,
visitor = as.numeric(visitor),
station = as.numeric(station),
min_date = as.numeric(min_date),
max_date = as.numeric(max_date),
label = paste(station, visitor, sep="_"))
rownames(sampleData) <- sampleData$label
dat <- scale(sampleData[,c("count", "time", "station")])
ggplot(data=sampleData,aes(count, time, weekday))+geom_point(aes(colour = weekday))
ggplot(data=sampleData,aes(count, time, station))+geom_point(aes(colour = station))
hc <- agnes(dat, method="ward", metric = "manhattan")
dend <- as.dendrogram(hc)
dend <- color_branches(dend, k=3)
# plot(dend, horiz = TRUE, nodePar = list(cex = .007))
plot(cut(dend, h = 15)$upper, horiz = FALSE, cex=0.01)
# plot(hc, which.plots=2, cex=0.1, main="", xlab="")
sampleData$labels = factor(cutree(dend, k=3))
ggplot(sampleData, aes(count, time, label=station, color=labels))+geom_text(size=4)+theme_bw()
plot_ly(sampleData, x = count, y = time, z = station, type = "scatter3d", mode = "markers", color = labels)
GRUPA 3 -
dat <- scale(sampleData[,c("count", "time", "station")])
hc <- hclust(dist(dat, method = "manhattan"), "ward.D")
sampleData$labels = factor(cutree(hc, k=3))
ggplot(sampleData, aes(count, time, label=station, color=labels))+geom_text(size=4)+theme_bw()
plot_ly(sampleData, x = count, y = time, z = station, type = "scatter3d", mode = "markers", color = labels)
Add station to features
sampleData <- mergedData[sample(nrow(mergedData), 3000),]
sampleData <- transform(sampleData,
visitor = as.numeric(visitor),
station = as.numeric(station),
min_date = as.numeric(min_date),
max_date = as.numeric(max_date),
label = paste(station, visitor, sep="_"))
dat <- scale(sampleData[,c("count","time", "visitor", "min_date", "max_date", "station")])
hc <- hclust(dist(dat, method = "manhattan"), "ward.D")
#plot(hc, labels = FALSE)
sampleData$labels = factor(cutree(hc, k=3))
ggplot(sampleData, aes(count, time, label=station, color=labels))+geom_text(size=3)+theme_bw()
plot_ly(sampleData, x = count, y = time, z = station, type = "scatter3d", mode = "markers", color = labels)
This is how we have clustered our data for now (using 3 groups)
Try to get data by visitor (not by visitor, station) and introduce new features
Calculate new features
phase1Data <- mergedData[,c("visitor", "station", "max_date", "min_date", "time", "weekday", "hour", "count")]
phase1Data = phase1Data %>%
group_by(visitor) %>%
summarise(max_date=max(max_date),
min_date=min(min_date),
total_time=sum(time),
min_time=min(time),
max_time=max(time),
weekday=head(weekday,1),
hour=head(hour,1),
total_count = sum(count),
max_count = max(count),
min_count = min(count),
most_freq_station = head(station[which(count == max(count))],1),
least_freq_station = head(station[which(count == min(count))],1))
sampleData <- phase1Data[sample(nrow(phase1Data), 2000),]
sampleData <- sampleData[order(sampleData$visitor),]
rownames(sampleData) <- sampleData$visitor
kmeansData <- transform(sampleData,
visitor = as.numeric(visitor),
max_date = as.POSIXlt(max_date)$hour + as.POSIXlt(max_date)$min/60,
min_date = as.POSIXlt(min_date)$hour + as.POSIXlt(min_date)$min/60,
total_time = as.numeric(total_time),
min_time = as.numeric(min_time),
max_time = as.numeric(max_time),
#weekday = as.numeric(weekday),
hour = as.numeric(hour),
#most_freq_station = as.numeric(most_freq_station),
#least_freq_station = as.numeric(least_freq_station),
label = visitor)
kmeansData$max_date <- scale(kmeansData$max_date)
kmeansData$min_date <- scale(kmeansData$min_date)
kmeansData$total_time <- scale(kmeansData$total_time)
kmeansData$min_time <- scale(kmeansData$min_time)
kmeansData$max_time <- scale(kmeansData$max_time)
kmeansData$least_freq_station <- scale(kmeansData$total_count)
Plot a SPLOM: (how features depend of each other)
SPLOM_DATA <- kmeansData[,c("total_time", "max_time", "min_time","min_date", "max_date", "min_count", "max_count", "total_count")]
station_col <- rev(rainbow_hcl(65))[as.numeric(sampleData$most_freq_station)]
pairs(SPLOM_DATA, col = station_col,
lower.panel = NULL,
cex.labels=1, pch=15, cex = 0.75)
set.seed(4)
model1 <- kmeans(kmeansData[,c("total_time", "max_time", "min_time","min_date", "max_date", "min_count", "max_count", "total_count")], 4)
kmeansData$cluster <- factor(model1$cluster)
nd <- data.frame(model1$centers)
ggplot(kmeansData, aes(total_time, total_count)) +
geom_text(size=3, aes(label=most_freq_station, color=cluster)) +
geom_point(data=nd, size=3)+
theme_bw()
plot_ly(kmeansData, x = max_time, y = total_count, z = total_time, type = "scatter3d", mode = "markers", color = cluster)
plot_ly(kmeansData, x = total_count, y = min_time, z = total_time, type = "scatter3d", mode = "markers", color = cluster)
plot_ly(kmeansData, x = total_count, y = min_date, z = total_time, type = "scatter3d", mode = "markers", color = cluster)
set.seed(4)
model1 <- kmeans(kmeansData[,c("total_time", "total_count")], 4)
kmeansData$cluster <- factor(model1$cluster)
nd <- data.frame(model1$centers)
ggplot(kmeansData, aes(total_time, total_count)) +
geom_text(size=3, aes(label=most_freq_station, color=cluster)) +
geom_point(data=nd, size=3)+
theme_bw()
kmFirstGroup = kmeansData[kmeansData$cluster == 1,]
Long time of use and frequent interactions with machines
qplot(kmFirstGroup$most_freq_station)+geom_bar()
Machines which are used most frequent: 10, 05, 56, 66 Least which are used least frequent: 18, 20
kmSecondGroup = kmeansData[kmeansData$cluster == 2,]
Rather medium and short times of interactions, the number of interactions is small
qplot(kmSecondGroup$most_freq_station)+geom_bar()
Similar to first group, however ‘cnk18’ is used more frequent, ‘cnk19’ is used less frequent. ‘cnk20’ is the most frequently used machine. Members, who use machine for short time, chooses ‘cnk20’ machine more frequently than people who use machine for long time.
kmThirdGroup = kmeansData[kmeansData$cluster == 3,]
Result is similar to the result from second group.
qplot(kmThirdGroup$most_freq_station)+geom_bar()
kmFourthGroup = kmeansData[kmeansData$cluster == 4,]
Members who use machine only for short time and they do have small number of iterations
qplot(kmFourthGroup$most_freq_station)+geom_bar()
In most cases, members choose ‘19a’ machine, completely different than people who spend long times while using machines.
pamData <- transform(sampleData,
max_date = as.POSIXlt(max_date)$hour + as.POSIXlt(max_date)$min/60,
min_date = as.POSIXlt(min_date)$hour + as.POSIXlt(min_date)$min/60,
total_time = as.numeric(total_time),
min_time = as.numeric(min_time),
max_time = as.numeric(max_time), label = visitor)
model4 <- pam(pamData[,c("total_time", "total_count")], 4)
pamData$cluster <- factor(model4$clustering)
nd <- data.frame(model4$medoids)
ggplot(pamData, aes(total_time, total_count)) +
geom_text(size=3, aes(label=most_freq_station, color=cluster)) +
# geom_point(data=nd, size=3)+
theme_bw()
# plot_ly(pamData, x = total_time, y = total_count, z = most_freq_station, type = "scatter3d", mode = "markers", color = cluster)
# plot_ly(pamData, x = total_time, y = hour, z = most_freq_station, type = "scatter3d", mode = "markers", color = cluster)
# plot_ly(pamData, x = total_count, y = min_date, z = weekday, type = "scatter3d", mode = "markers", color = cluster)
pc <- prcomp(SPLOM_DATA)
comp <- data.frame(pc$x[,1:4])
plot(comp, pch=16, col=rgb(0,0,0,0.5))
k <- kmeans(comp, 4, nstart=25, iter.max=1000)
plot(comp, col=k$clust, pch=16)
summary(pc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 24.9358 7.5305 4.26549 1.3992 1.05410 0.54217
## Proportion of Variance 0.8881 0.0810 0.02599 0.0028 0.00159 0.00042
## Cumulative Proportion 0.8881 0.9691 0.99513 0.9979 0.99951 0.99993
## PC7 PC8
## Standard deviation 0.16366 0.14785
## Proportion of Variance 0.00004 0.00003
## Cumulative Proportion 0.99997 1.00000
biplot(pc)
patternData <- mergedData[,c("visitor", "station", "time", "count", "min_date")]
head(patternData)
## visitor station time count min_date
## 1 795453 cnk61 18 5 2012-01-03 09:53:00
## 2 795455 cnk10 88 12 2012-01-03 10:00:20
## 3 795455 cnk66 92 17 2012-01-03 09:58:29
## 4 795467 cnk10 34 4 2012-01-03 09:45:06
## 5 795476 cnk18 31 8 2012-01-03 09:41:09
## 6 795476 cnk38 82 21 2012-01-03 10:21:33
patternData = patternData %>%
arrange(min_date) %>%
group_by(visitor) %>%
summarise(total_time=sum(time),
total_count = sum(count),
first_station = head(station,1),
last_station = tail(station,1),
station_path = paste(station, collapse="_"),
most_freq_station = head(station[which(count == max(count))],1),
least_freq_station = head(station[which(count == min(count))],1))
sampleData <- patternData[sample(nrow(patternData), 2000),]
sampleData <- sampleData[order(sampleData$visitor),]
rownames(sampleData) <- sampleData$visitor
d <- stringdistmatrix(sampleData$station_path, sampleData$station_path)
cl <- hclust(as.dist(d))
#plot(cl)
sampleData$labels = factor(cutree(cl, k=3))
ggplot(sampleData, aes(total_count, total_time, label=most_freq_station, color=labels))+geom_text(size=3)+theme_bw()
ggplot(sampleData, aes(total_count, total_time, color=labels))+geom_point(size=2)+theme_bw()
firstGroup = (sampleData %>% filter(labels == 1))
qplot(firstGroup$most_freq_station)+geom_bar()
Use of machine is equally distributed
qplot(firstGroup$total_time)+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Domination of short times (below 500)
head(firstGroup[,c("station_path")])
## Source: local data frame [6 x 1]
##
## station_path
## (chr)
## 1 cnk61_cnk56
## 2 cnk66_cnk20_cnk05_cnk56
## 3 cnk18
## 4 cnk18
## 5 cnk05_cnk10_cnk66_cnk18
## 6 cnk10_cnk20_cnk18_cnk61
Mainly short processes, mostly 1 or 2 machines used
secondGroup = (sampleData %>% filter(labels == 2))
qplot(secondGroup$most_freq_station)+geom_bar()
Some machines, for example cnk19a are not used by users from the same group. On the other hand ‘cnk66’, ‘cnk05’ are used very frequently.
qplot(secondGroup$total_time)+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Total times are medium ones and are distrbitued with normal distribution (around 500)
head(secondGroup[,c("station_path")])
## Source: local data frame [6 x 1]
##
## station_path
## (chr)
## 1 cnk10_cnk05_cnk66_cnk20_cnk38_cnk56_cnk61
## 2 cnk66_cnk10_cnk20_cnk05_cnk18_cnk61_cnk56
## 3 cnk10_cnk05_cnk19a_cnk66_cnk20_cnk18_cnk61
## 4 cnk19a_cnk05_cnk66_cnk20_cnk38_cnk56
## 5 cnk19a_cnk10_cnk66_cnk20_cnk05_cnk18
## 6 cnk19a_cnk20_cnk66_cnk10_cnk18_cnk61
Users of this group are machines in random way. They start mostly on machines: ‘20’, ‘05’, ‘10’ and ends on ‘56’ or ‘38’. Also they use machines ‘66’, ‘61’, ‘18’, ‘20’ in random order.
thirdGroup = (sampleData %>% filter(labels == 3))
qplot(thirdGroup$most_freq_station)+geom_bar()
Users of this group uses machine more randomly, ‘cnk18’ is almost not used by group members. On the other hand cnk38 is used very often. Those people uses also cnk66 and cnk05.
qplot(thirdGroup$total_time)+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Total usage times are large (above 500)
head(thirdGroup[,c("station_path")])
## Source: local data frame [6 x 1]
##
## station_path
## (chr)
## 1 cnk20_cnk66_cnk61_cnk38_cnk18
## 2 cnk05_cnk66_cnk10_cnk20_cnk61_cnk56
## 3 cnk05_cnk10_cnk66_cnk18_cnk20_cnk56
## 4 cnk05_cnk10_cnk20_cnk66_cnk18_cnk38
## 5 cnk05_cnk66_cnk20_cnk18_cnk56_cnk38
## 6 cnk10_cnk05_cnk66_cnk18_cnk61
Group members starts mostly on ‘cnk10’ or ‘cnk05’ machine and then they play on ‘cnk66’, ‘cnk20’, ‘cnk18’ and ‘cnk61’ machines. At the end they finish on ‘cnk56 and ’cnk38’.
sampleData$labels = factor(cutree(cl, k=100))
Information for 3 groups
firstGroup = (sampleData %>% filter(labels == 1))
length(unique(firstGroup$station_path))
## [1] 62
length(firstGroup$station_path)
## [1] 511
unique(firstGroup$station_path)
## [1] "cnk61_cnk56" "cnk38_cnk18" "cnk10_cnk05" "cnk66_cnk20"
## [5] "cnk20_cnk61" "cnk20_cnk66" "cnk56_cnk05" "cnk18_cnk61"
## [9] "cnk05_cnk10" "cnk56_cnk61" "cnk66_cnk18" "cnk20_cnk56"
## [13] "cnk10_cnk66" "cnk05_cnk18" "cnk05_cnk56" "cnk10_cnk61"
## [17] "cnk05_cnk66" "cnk10_cnk56" "cnk20_cnk18" "cnk18_cnk56"
## [21] "cnk66_cnk61" "cnk05_cnk20" "cnk10_cnk20" "cnk18_cnk38"
## [25] "cnk18_cnk20" "cnk66_cnk56" "cnk38_cnk56" "cnk10_cnk18"
## [29] "cnk05_cnk61" "cnk20_cnk10" "cnk66_cnk38" "cnk20_cnk38"
## [33] "cnk61_cnk18" "cnk56_cnk10" "cnk18_cnk10" "cnk18_cnk66"
## [37] "cnk19a_cnk20" "cnk05_cnk19a" "cnk05_cnk38" "cnk19a_cnk18"
## [41] "cnk19a_cnk38" "cnk19a_cnk10" "cnk10_cnk19a" "cnk19a_cnk66"
## [45] "cnk19a_cnk61" "cnk61_cnk66" "cnk18_cnk19a" "cnk56_cnk19a"
## [49] "cnk66_cnk19a" "cnk19a_cnk56" "cnk66_cnk10" "cnk61_cnk19a"
## [53] "cnk61_cnk38" "cnk19a_cnk05" "cnk10_cnk38" "cnk18_cnk05"
## [57] "cnk20_cnk19a" "cnk38_cnk20" "cnk38_cnk61" "cnk38_cnk19a"
## [61] "cnk61_cnk20" "cnk38_cnk10"
secondGroup = (sampleData %>% filter(labels == 3))
length(unique(secondGroup$station_path))
## [1] 9
length(secondGroup$station_path)
## [1] 811
unique(secondGroup$station_path)
## [1] "cnk18" "cnk05" "cnk66" "cnk20" "cnk56" "cnk61" "cnk10" "cnk38"
## [9] "cnk19a"
thirdGroup = (sampleData %>% filter(labels == 7))
length(unique(thirdGroup$station_path))
## [1] 24
length(thirdGroup$station_path)
## [1] 60
unique(thirdGroup$station_path)
## [1] "cnk05_cnk10_cnk66" "cnk05_cnk20_cnk56" "cnk05_cnk10_cnk18"
## [4] "cnk05_cnk66_cnk38" "cnk05_cnk18_cnk61" "cnk05_cnk66_cnk18"
## [7] "cnk05_cnk20_cnk18" "cnk05_cnk10_cnk56" "cnk05_cnk66_cnk20"
## [10] "cnk05_cnk10_cnk61" "cnk05_cnk19a_cnk66" "cnk05_cnk19a_cnk18"
## [13] "cnk05_cnk19a_cnk56" "cnk05_cnk61_cnk56" "cnk05_cnk19a_cnk38"
## [16] "cnk05_cnk19a_cnk20" "cnk05_cnk19a_cnk61" "cnk05_cnk38_cnk56"
## [19] "cnk05_cnk20_cnk10" "cnk05_cnk66_cnk10" "cnk05_cnk20_cnk66"
## [22] "cnk05_cnk18_cnk66" "cnk05_cnk20_cnk38" "cnk05_cnk61_cnk18"
The population is heterogeneous.
Research carried out on the supplied data set showed that it is possible to distinguish four groups that show signs of similarities.
As an example, one of the group has following features. Team members uses machines for short time and total numer of used machines is relatively low. The other group members plays much longer then previous one. Moreover, one can determine a specific path of their usage (starting on specific machine, ending in some machine).
Histogram from the document shows that machines, which were used by people, who plays only few times are not used by team members which uses machine very frequently. To be more specific - number of usage of machine ‘19a’ is small, furthermore it is used by such players, not by addicted people.
Second conclusion is that there exists group, which uses machine only few times and then they give up (resign) due to fact, that they lose some amount of money or they just wanted to try.
Another fact is that another group can be indetified. Its members develop some schemas and strategies (using specific machines, specific ‘paths through machines’). Such members can be considered as ‘Addicts’. Machines which are used most frequent: 10, 05, 56, 66 Least which are used least frequent: 18, 20
From researcher point of view, it is possible to point out, that there exists group which uses machines occasionally - ‘Occasional players’. Frequency of playing is smaller than frequency of the aforementioned group. Usage of machines is similar to ‘Addicts’ group, however ‘cnk18’ is used more frequently, ‘cnk19’ is used less frequently. ‘cnk20’ is the most frequently used machine. Members, who use machine for short time, chooses ‘cnk20’ machine more frequently than people who use machine for long time.
Machine patterns: If we increase the number of classes, we will obtain 100 groups of people with very similar preferences, eg. single person whose behaviour is similar every time. In this way people are clustered together when: they have the same length of their path, visit similar machines and start from the same station.